{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# **YouTube Video RAG**\n",
        "\n",
        "Authored by [Kalyan KS](https://www.linkedin.com/in/kalyanksnlp/). To stay updated with LLM, RAG and Agent updates, you can follow me on [Twitter](https://x.com/kalyan_kpl).\n",
        "\n",
        "- Step-1 : Extract the YouTube video transcript\n",
        "- Step-2 : Chunk the extracted transcript text\n",
        "- Step-3 : Create a vector store with the transcript chunks\n",
        "- Step-4 : Create a retriever which will return the relevant chunks\n",
        "- Step-5 : Build context from the relevant chunk texts\n",
        "- Step-6 : Build the RAG chain using rag prompt, LLM and string output parser.\n",
        "- Step-7 : Run the RAG chain to get the answer."
      ],
      "metadata": {
        "id": "bkvZp-8oJxfb"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Install and import libraries**\n",
        "\n",
        "- YoutubeLoader uses `youtube-transcript-api` python library to extract the transcript."
      ],
      "metadata": {
        "id": "WX1Zty3aK8dF"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -qU langchain langchain-community langchain-text-splitters\n",
        "!pip install -qU langchain-openai langchain-chroma youtube-transcript-api"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ieCqdSLiK-3V",
        "outputId": "ffbe40ea-37e4-4a20-ee86-59c9443369c6"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/2.5 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[91m━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.5/2.5 MB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K   \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m38.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.9/50.9 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.9/54.9 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m622.3/622.3 kB\u001b[0m \u001b[31m20.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m611.1/611.1 kB\u001b[0m \u001b[31m39.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m73.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.6/278.6 kB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m68.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.6/101.6 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.3/13.3 MB\u001b[0m \u001b[31m85.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.9/55.9 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.4/177.4 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.0/65.0 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m118.7/118.7 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.0/73.0 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m459.8/459.8 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.5/71.5 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m26.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m452.6/452.6 kB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain_community.document_loaders import YoutubeLoader\n",
        "from langchain_text_splitters  import RecursiveCharacterTextSplitter\n",
        "from langchain_chroma import Chroma\n",
        "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
        "from langchain_core.prompts import ChatPromptTemplate\n",
        "from langchain_core.output_parsers import StrOutputParser\n",
        "from langchain_core.runnables import RunnableLambda"
      ],
      "metadata": {
        "id": "TXetwTFYLA8X"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Set up LLM API Key**\n",
        "\n",
        "- Save the `OPENAI_API_KEY` in Google Colab Secrets"
      ],
      "metadata": {
        "id": "bg-Xr6WgLBVj"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import userdata\n",
        "import os\n",
        "os.environ[\"OPENAI_API_KEY\"] = userdata.get('OPENAI_API_KEY')"
      ],
      "metadata": {
        "id": "pnsVEcU1LFwZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Extract YouTube video transcript**"
      ],
      "metadata": {
        "id": "0MwnhbIrLIHf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from typing import List\n",
        "from langchain.schema import Document\n",
        "\n",
        "def yt_transcript(video_url: str) -> List[Document]:\n",
        "    \"\"\"\n",
        "    Extracts transcript of given YouTube video using YoutubeLoader.\n",
        "\n",
        "    Parameters:\n",
        "    video_url (str): The URL of the YouTube video.\n",
        "\n",
        "    Returns:\n",
        "    List[Document]: A list of Document objects containing the transcript.\n",
        "    \"\"\"\n",
        "\n",
        "    print(\"YouTube video transcript is extracted...\")\n",
        "    loader = YoutubeLoader.from_youtube_url(video_url)\n",
        "    transcript = loader.load()\n",
        "\n",
        "    return transcript"
      ],
      "metadata": {
        "id": "HBpkh5fALREj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "video_url = \"https://www.youtube.com/watch?v=d4IyR-kl_mY\"\n",
        "transcript = yt_transcript(video_url)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vnHyVr2HQeJa",
        "outputId": "d01bfb85-1b61-4fd3-f250-56c744539a55"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "YouTube video transcript is extracted...\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(transcript)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Y_fb-lCUap4X",
        "outputId": "8c8ecfcc-8d05-4f61-bafe-8c85de206c36"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[Document(metadata={'source': 'd4IyR-kl_mY'}, page_content=\"a passenger fery service from nagapattinam in Tamil Nadu to kesan Tay in jafna is being resumed after several months this has revived the potential for strengthening cultural and economic ties between India and Sri Lanka what is this fairy service all [Music] about welome this is T sures Kumar thank you for joining me in this episode of the Focus Tamil Nadu on August 16th shivaganga a passenger faery service will set sail from the nagap patnam port and it will reach kesay or kks port in about 4 hours this fairy service has a seating occupancy of 150 including 27 Premier seating the two coastal towns have historically shared a very close cultural ties in ancient times nagan Nadu that is the present day nagapattinam referred only to Sri Lanka likewise kesan derived its name after the Hindu dat mugan or ktia Kang is famed for his beaches and temples it has two historic temples the kirim Malai nageswaran Temple and the muram kandaswami temple India and Sri Lanka have historically shared a very good Maritime ties however the Civil War in the country uh between the ethnic tamils and the salies that lasted for 25 years years had disrupted the movement of goods and passengers on its traditional sea routs these two routs included one from tal Mana to ramesham and the other one from toi to Columbo after the end of the Civil War in Sri Lanka in 2009 they were efforts to revive the maram ties between the two countries in 2011 that is 2 years after the Civil War a passenger fery service was launched between tutuki and Columbo however after fing more than 12,000 passengers this service was suspended subsequently several rounds of talks were held between diplomats of both countries and politicians hoping to revive various sea roads initially the two countries had descided to launch a passenger faery service from caral in the union territory of puducheri to KES and turai however subsequently it was decided 
to have the fer service from nagapattinam to kks Port last year amid's great Fanfare Cher Yani a highspeed craft had set sail from nagap patam to Kay carrying about 50 passengers on board at that time prime minister Narendra Modi had said that this will be a new chapter in diplomatic and economic ties between the two countries he was optimistic that the fery service would help to strengthen the cultural civilizational and Commercial ties between India and Sri Lanka he believed that this marittime connectivity was the central theme of The Joint vision of Indo Sri Lanka economic ties Modi had even said that India will now take steps to revive the sea route between t Manar and rameshwaram the Sri Lankan president ranil Vikram Ming had also hailed this revival of Maritime ties between the two countries however this fery service was shortlived after its initial Journey it was postponed and rescheduled citing weather conditions but eventually the fery service was suspended completely in fact at one point India and Sri Lanka had a very thriving Maritime route with a rail link for close to 50 years in the early part of 19th century they applied a train between Metras and danush Kodi called the Indo silon Express passengers would board the train from medras and reach the danush Kodi P from there they would take a boat service to reach Sri Lanka in fact for this purpose this train the indoo express was also called as the boat mail from the danush Kaya there were two boats ir and goshan that would take passengers to tal Manar in Sri Lanka during the World War II the boat mail was known to Ferry hundreds of troops from India to sone which was then the headquarters of Lord Louis bount batt this boat mail was launched in 1914 after the construction of the historic pan Bridge it was operational until 1964 when a severe cyclonic storm devastated Danish Kodi and left it in ruins the Cyclone had washed away an entire train with passengers and also left the danush Kodi station in 
Ruins thereafter this service was not resumed against this backdrop now the two countries had have to take additional steps to ensure that the shangai passenger fery Service uh continues to thrive in the coming months and years from the Indian side and on beh of the tamad government steps must be taken to create additional facilities in nagap patnam Port besides the Rail Link to nagap patam needs to be strengthened this will enable people who come from the northern province in Sri Lanka to visit various temples in Tamil Nadu with which they have deep ties including the ch nataja Temple the Sri Lankan side and its policy makers must also take steps to strengthen economic ties with India by encouraging exports from the northern Province earlier this year the Sri Lankan cabinet had cleared a proposal for strengthening and renovating the Congress and Report with uh the Indian government agreeing to take on the entire cost hopefully this will uh move forward and in the coming years we will see a very thriving Maritime route between India and Sri Lanka on that note D surh Kumar signing off thank you\")]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(f\"Number of documents = {len(transcript)}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BH6IAjYyWdpG",
        "outputId": "0f20eac7-e535-4cf4-f40c-2b1c86b460ae"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number of documents = 1\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Chunk Transcript text**"
      ],
      "metadata": {
        "id": "jLqBvd_JLTb-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def yt_chunk(transcript: List[Document]) -> List[Document]:\n",
        "    \"\"\"\n",
        "    Splits extracted transcript text into smaller chunks using RecursiveCharacterTextSplitter.\n",
        "\n",
        "    Parameters:\n",
        "    transcript (List[Document]): A list of Document objects containing extracted transcript.\n",
        "\n",
        "    Returns:\n",
        "    List[Document]: A list of chunked Document objects.\n",
        "    \"\"\"\n",
        "\n",
        "    print(\"YouTube video transcript text is chunked....\")\n",
        "    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
        "    chunks = text_splitter.split_documents(transcript)\n",
        "\n",
        "    return chunks"
      ],
      "metadata": {
        "id": "uK1aFrhvLWbp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "chunks = yt_chunk(transcript)"
      ],
      "metadata": {
        "id": "TdDmlfxkROZ_",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "00333221-3c13-4a70-f414-960f45839a98"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "PDF file text is chunked....\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(f\"Number of chunks = {len(chunks)}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3VzMTYugRRCW",
        "outputId": "229fa654-e32e-4e43-c984-7a1d81009442"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number of chunks = 6\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(chunks[0])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "B2wW4EJXXEo6",
        "outputId": "450b1325-ef56-4293-f720-23f949093300"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "page_content='a passenger fery service from nagapattinam in Tamil Nadu to kesan Tay in jafna is being resumed after several months this has revived the potential for strengthening cultural and economic ties between India and Sri Lanka what is this fairy service all [Music] about welome this is T sures Kumar thank you for joining me in this episode of the Focus Tamil Nadu on August 16th shivaganga a passenger faery service will set sail from the nagap patnam port and it will reach kesay or kks port in about 4 hours this fairy service has a seating occupancy of 150 including 27 Premier seating the two coastal towns have historically shared a very close cultural ties in ancient times nagan Nadu that is the present day nagapattinam referred only to Sri Lanka likewise kesan derived its name after the Hindu dat mugan or ktia Kang is famed for his beaches and temples it has two historic temples the kirim Malai nageswaran Temple and the muram kandaswami temple India and Sri Lanka have historically shared a' metadata={'source': 'd4IyR-kl_mY'}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Create Vector Store**"
      ],
      "metadata": {
        "id": "FqAC3V1WLdtW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Set the chroma DB path\n",
        "current_dir = \"/content/rag\"\n",
        "persistent_directory = os.path.join(current_dir, \"db\", \"chroma_db_yt\")"
      ],
      "metadata": {
        "id": "qZ5ls-3uLh8s"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def create_vector_store(chunks: List[Document], db_path: str) -> Chroma:\n",
        "    \"\"\"\n",
        "    Creates a Chroma vector store from chunked documents.\n",
        "\n",
        "    Parameters:\n",
        "    chunks (List[Document]): A list of chunked Document objects.\n",
        "    db_path (str): The directory path to persist the vector store.\n",
        "\n",
        "    Returns:\n",
        "    Chroma: A Chroma vector store containing the embedded documents.\n",
        "    \"\"\"\n",
        "\n",
        "    print(\"Chrome vector store is created...\\n\")\n",
        "\n",
        "    embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
        "    db = Chroma.from_documents(documents=chunks, embedding=embedding_model, persist_directory=db_path)\n",
        "\n",
        "    return db"
      ],
      "metadata": {
        "id": "_30SvTV7Lg1K"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "db = create_vector_store(chunks, persistent_directory)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XS8knuW4RiQ8",
        "outputId": "b810cac6-56b5-4aa5-accf-66e8dfe1e964"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Chrome vector store is created...\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Retrieve relevant chunks**"
      ],
      "metadata": {
        "id": "fV2W-AlfLwFX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def retrieve_context(db: Chroma, query: str) -> List[Document]:\n",
        "    \"\"\"\n",
        "    Retrieves relevant document chunks from the Chroma vector store based on a query.\n",
        "\n",
        "    Parameters:\n",
        "    db (Chroma): The Chroma vector store containing embedded documents.\n",
        "    query (str): The query string to search for relevant document chunks.\n",
        "\n",
        "    Returns:\n",
        "    List[Document]: A list of retrieved relevant document chunks.\n",
        "    \"\"\"\n",
        "\n",
        "    retriever = db.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 2})\n",
        "    print(\"Relevant chunks are retrieved...\\n\")\n",
        "    relevant_chunks = retriever.invoke(query)\n",
        "\n",
        "    return relevant_chunks"
      ],
      "metadata": {
        "id": "7GPAc2iGL3bK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "query = \"passenger fery service is resumed from which place?\"\n",
        "\n",
        "relevant_chunks = retrieve_context(db, query)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "UO-clF3KRrZj",
        "outputId": "2123c00f-abab-45c5-b1dc-7e2e438a108f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Relevant chunks are retrieved...\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(f\"Number of relevant chunks = {len(relevant_chunks)}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ytjpbOpaRzxI",
        "outputId": "c84cc8ca-7134-4a5b-eab2-36369ce89ec3"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number of relevant chunks = 2\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "for i, chunk in enumerate(relevant_chunks):\n",
        "  print(f\"Chunk-{i}\")\n",
        "  print(chunk)\n",
        "  print(\"\\n\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rk8DZzajR2Ya",
        "outputId": "ee6b954e-0e9e-4c69-dc1b-9d24c45eb255"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Chunk-0\n",
            "page_content='a passenger faery service from caral in the union territory of puducheri to KES and turai however subsequently it was decided to have the fer service from nagapattinam to kks Port last year amid's great Fanfare Cher Yani a highspeed craft had set sail from nagap patam to Kay carrying about 50 passengers on board at that time prime minister Narendra Modi had said that this will be a new chapter in diplomatic and economic ties between the two countries he was optimistic that the fery service would help to strengthen the cultural civilizational and Commercial ties between India and Sri Lanka he believed that this marittime connectivity was the central theme of The Joint vision of Indo Sri Lanka economic ties Modi had even said that India will now take steps to revive the sea route between t Manar and rameshwaram the Sri Lankan president ranil Vikram Ming had also hailed this revival of Maritime ties between the two countries however this fery service was shortlived after its initial' metadata={'source': 'd4IyR-kl_mY'}\n",
            "\n",
            "\n",
            "Chunk-1\n",
            "page_content='a passenger fery service from nagapattinam in Tamil Nadu to kesan Tay in jafna is being resumed after several months this has revived the potential for strengthening cultural and economic ties between India and Sri Lanka what is this fairy service all [Music] about welome this is T sures Kumar thank you for joining me in this episode of the Focus Tamil Nadu on August 16th shivaganga a passenger faery service will set sail from the nagap patnam port and it will reach kesay or kks port in about 4 hours this fairy service has a seating occupancy of 150 including 27 Premier seating the two coastal towns have historically shared a very close cultural ties in ancient times nagan Nadu that is the present day nagapattinam referred only to Sri Lanka likewise kesan derived its name after the Hindu dat mugan or ktia Kang is famed for his beaches and temples it has two historic temples the kirim Malai nageswaran Temple and the muram kandaswami temple India and Sri Lanka have historically shared a' metadata={'source': 'd4IyR-kl_mY'}\n",
            "\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Build context**"
      ],
      "metadata": {
        "id": "BcJbxTaqL7QE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def build_context(relevant_chunks: List[Document]) -> str:\n",
        "    \"\"\"\n",
        "    Builds a context string from retrieved relevant document chunks.\n",
        "\n",
        "    Parameters:\n",
        "    relevant_chunks (List[Document]): A list of retrieved relevant document chunks.\n",
        "\n",
        "    Returns:\n",
        "    str: A concatenated string containing the content of the relevant chunks.\n",
        "    \"\"\"\n",
        "\n",
        "    print(\"Context is built from relevant chunks\")\n",
        "    context = \"\\n\\n\".join([chunk.page_content for chunk in relevant_chunks])\n",
        "\n",
        "    return context"
      ],
      "metadata": {
        "id": "xRZ2HLDxMAcP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "context = build_context(relevant_chunks)"
      ],
      "metadata": {
        "id": "Enka2YvTSULr",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "1c61a530-61ef-4095-f189-30b15f817a6a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Context is built from relevant chunks\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(context)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S3Ndsd-4SWg1",
        "outputId": "da509105-0ce7-4e20-bacb-dd1c12bfda45"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "a passenger faery service from caral in the union territory of puducheri to KES and turai however subsequently it was decided to have the fer service from nagapattinam to kks Port last year amid's great Fanfare Cher Yani a highspeed craft had set sail from nagap patam to Kay carrying about 50 passengers on board at that time prime minister Narendra Modi had said that this will be a new chapter in diplomatic and economic ties between the two countries he was optimistic that the fery service would help to strengthen the cultural civilizational and Commercial ties between India and Sri Lanka he believed that this marittime connectivity was the central theme of The Joint vision of Indo Sri Lanka economic ties Modi had even said that India will now take steps to revive the sea route between t Manar and rameshwaram the Sri Lankan president ranil Vikram Ming had also hailed this revival of Maritime ties between the two countries however this fery service was shortlived after its initial\n",
            "\n",
            "a passenger fery service from nagapattinam in Tamil Nadu to kesan Tay in jafna is being resumed after several months this has revived the potential for strengthening cultural and economic ties between India and Sri Lanka what is this fairy service all [Music] about welome this is T sures Kumar thank you for joining me in this episode of the Focus Tamil Nadu on August 16th shivaganga a passenger faery service will set sail from the nagap patnam port and it will reach kesay or kks port in about 4 hours this fairy service has a seating occupancy of 150 including 27 Premier seating the two coastal towns have historically shared a very close cultural ties in ancient times nagan Nadu that is the present day nagapattinam referred only to Sri Lanka likewise kesan derived its name after the Hindu dat mugan or ktia Kang is famed for his beaches and temples it has two historic temples the kirim Malai nageswaran Temple and the muram kandaswami temple India and Sri Lanka have historically shared a\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Combine all the steps into one function**"
      ],
      "metadata": {
        "id": "XDlTllCSMOSE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from typing import Dict\n",
        "\n",
        "def get_context(inputs: Dict[str, str]) -> Dict[str, str]:\n",
        "    \"\"\"\n",
        "    Build the RAG context for a query from a YouTube video transcript.\n",
        "\n",
        "    A new vector store is created from the video transcript when 'db_path' is\n",
        "    missing or is an empty directory; otherwise the persisted store at 'db_path'\n",
        "    is loaded. The chunks relevant to the query are then retrieved and turned\n",
        "    into a context via build_context.\n",
        "\n",
        "    NOTE: an existing store is looked up by 'db_path' only. Reusing a 'db_path'\n",
        "    with a different 'video_url' loads the previously persisted store and\n",
        "    ignores the new URL.\n",
        "\n",
        "    Args:\n",
        "        inputs (Dict[str, str]): A dictionary containing the following keys:\n",
        "            - 'video_url' (str): The YouTube video URL.\n",
        "            - 'query' (str): The user query.\n",
        "            - 'db_path' (str): Path to the vector database.\n",
        "\n",
        "    Returns:\n",
        "        Dict[str, str]: A dictionary containing:\n",
        "            - 'context' (str): Extracted relevant context.\n",
        "            - 'query' (str): The user query.\n",
        "\n",
        "    Raises:\n",
        "        KeyError: If 'video_url', 'query' or 'db_path' is missing from inputs.\n",
        "    \"\"\"\n",
        "    video_url, query, db_path = inputs['video_url'], inputs['query'], inputs['db_path']\n",
        "\n",
        "    # An existing but empty directory (e.g. left behind by an interrupted run)\n",
        "    # holds no persisted store, so it is treated the same as a missing one\n",
        "    # instead of being loaded as an empty store that retrieves nothing.\n",
        "    store_exists = os.path.isdir(db_path) and bool(os.listdir(db_path))\n",
        "\n",
        "    # Create new vector store if it does not exist\n",
        "    if not store_exists:\n",
        "        print(\"Creating a new vector store...\\n\")\n",
        "        transcript = yt_transcript(video_url)\n",
        "        chunks = yt_chunk(transcript)\n",
        "        db = create_vector_store(chunks, db_path)\n",
        "\n",
        "    # Load the existing vector store\n",
        "    else:\n",
        "        print(\"Loading the existing vector store\\n\")\n",
        "        embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
        "        db = Chroma(persist_directory=db_path, embedding_function=embedding_model)\n",
        "\n",
        "    relevant_chunks = retrieve_context(db, query)\n",
        "    context = build_context(relevant_chunks)\n",
        "\n",
        "    return {'context': context, 'query': query}"
      ],
      "metadata": {
        "id": "cPKy-j4BMgHo"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Build RAG chain**"
      ],
      "metadata": {
        "id": "O2MVf4vxM4sv"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Prompt text with placeholders for the user query and the retrieved context\n",
        "template = \"\"\" You are an AI model trained for question answering. You should answer the\n",
        "  given question based on the given context only.\n",
        "  Question : {query}\n",
        "  \\n\n",
        "  Context : {context}\n",
        "  \\n\n",
        "  If the answer is not present in the given context, respond as: The answer to this question is not available\n",
        "  in the provided content.\n",
        "  \"\"\"\n",
        "\n",
        "# Chain components: prompt, chat model and string output parser\n",
        "rag_prompt = ChatPromptTemplate.from_template(template)\n",
        "llm = ChatOpenAI(model=\"gpt-4o-mini\")\n",
        "str_parser = StrOutputParser()\n",
        "\n",
        "# get_context supplies {'context', 'query'} -> prompt -> LLM -> string parser\n",
        "rag_chain = RunnableLambda(get_context) | rag_prompt | llm | str_parser"
      ],
      "metadata": {
        "id": "x9KX2O3OM8Yu"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## **Run RAG chain**"
      ],
      "metadata": {
        "id": "z-I63SUYNLT5"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Set the chroma DB path\n",
        "current_dir = \"/content/rag\"\n",
        "persistent_directory = os.path.join(current_dir, \"db\", \"chroma_db_yt\")"
      ],
      "metadata": {
        "id": "ZiqL9tnZNQ1v"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Video URL\n",
        "video_url = \"https://www.youtube.com/watch?v=d4IyR-kl_mY\""
      ],
      "metadata": {
        "id": "MWOhvXCJMNsK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Write the query\n",
        "query = 'passenger fery service is resumed from which place?'"
      ],
      "metadata": {
        "id": "0WIwZN65NJLW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Inputs read by get_context, the first step of the chain\n",
        "rag_inputs = {\n",
        "    'video_url': video_url,\n",
        "    'query': query,\n",
        "    'db_path': persistent_directory,\n",
        "}\n",
        "answer = rag_chain.invoke(rag_inputs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "FkmSBhOJNKeC",
        "outputId": "9f8b4eb6-ff5d-4b66-9e62-65bc1d9a4cef"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Loading the existing vector store\n",
            "\n",
            "Relevant chunks are retrieved...\n",
            "\n",
            "Context is built from relevant chunks\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Show the query, a blank line, then the generated answer\n",
        "print(f\"Query:{query}\\n\\nGenerated answer:{answer}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gyPrNA7iNSiR",
        "outputId": "6dd89c6e-a4f4-4126-c40a-df808708168e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Query:passenger fery service is resumed from which place?\n",
            "\n",
            "Generated answer:The passenger ferry service is resumed from Nagapattinam in Tamil Nadu.\n"
          ]
        }
      ]
    }
  ]
}